rm(list = ls())
here::i_am(file.path("code", "19_naacp_match_rates.R"))
library(here)
source(here("code", "config.R"))

# Render missing cells in kable() tables as "-" rather than "NA".
options(knitr.kable.NA = "-")

# Inputs: roster records and their census links, one parquet file each.
naacp <- read_parquet(here("data", "analysis", "naacp.parquet"))
naacp_matches <- read_parquet(here("data", "analysis", "naacp_census_matches.parquet"))

# Figure: number of roster records per roster year, split by whether the record
# has a link in the "combined" census-year set. `p` holds the data, aesthetics,
# theme and labels only; each output variant adds its own bar layer.
p <- naacp |>
  left_join(
    naacp_matches |>
      filter(census_year == "combined") |>
      select(naacp_id, census_id),
    by = "naacp_id"
  ) |>
  # A record counts as linked when the join found a census_id for it.
  mutate(is_matched = ifelse(is.na(census_id), "No", "Yes")) |>
  count(year, is_matched) |>
  ggplot(aes(x = year, y = n, fill = is_matched)) +
  my_theme() +
  labs(
    fill = "Linked to census",
    y = "Number of records",
    x = "NAACP roster year"
  ) +
  # NOTE(review): xlim() sets scale limits, and ggplot2 drops data that falls
  # outside them. A bar centred on 1940 extends slightly past 1940, so if any
  # rosters are dated 1940, confirm that bar still appears in the output.
  xlim(NA, 1940)

# Colour version: default fill palette.
p_color <- p + geom_col()
ggsave(file.path(fig_dir, "color", "naacp_roster_years.pdf"), plot = p_color, width = 6, height = 3.5)

# Black-and-white version: outlined bars with a light-gray / black fill.
# Built from `p` (not `p_color`) so the bars are drawn once instead of stacking
# a second geom_col() layer on top of the colour one.
# "#dddddd" is the six-digit spelling of the same gray as "#ddd"; older R
# releases may reject the three-digit shorthand.
p_bw <- p +
  geom_col(color = "black", linewidth = 0.3) +
  scale_fill_manual(values = c("No" = "#dddddd", "Yes" = "black"))
ggsave(file.path(fig_dir, "bw", "naacp_roster_years.pdf"), plot = p_bw, width = 6, height = 3.5)

################################################################################
# Match rate by census year and method

# Table: number and proportion of links by link type, for each census-year set
# (1930, 1940, combined). Three kinds of rows are stacked:
#   * one row per individual match_type value (match_type is unnested first,
#     so a link carrying several types is counted once under each),
#   * "Any":    all link rows in that census-year set,
#   * "Unique": distinct census_id values in that census-year set.
# Proportions use the number of distinct roster records as the denominator.
bind_rows(
  naacp_matches |>
    select(naacp_id, census_year, match_type) |>
    unnest_longer(match_type) |>
    count(census_year, match_type) |>
    mutate(match_type = as.character(match_type)),
  naacp_matches |>
    count(census_year) |>
    mutate(match_type = "Any"),
  naacp_matches |>
    group_by(census_year) |>
    summarize(n = n_distinct(census_id)) |>
    mutate(match_type = "Unique")
) |>
  mutate(p = n / n_distinct(naacp$naacp_id)) |>
  pivot_wider(id_cols = match_type, names_from = census_year, values_from = c(n, p)) |>
  mutate(
    # Format for display but keep missing cells (a link type absent from a
    # census-year set) as NA: formatC()/sprintf() would otherwise turn them
    # into the literal text "NA", which kable's knitr.kable.NA replacement
    # does not recognise as missing.
    across(
      starts_with("n_"),
      \(x) replace(formatC(x, big.mark = ","), is.na(x), NA_character_)
    ),
    across(
      starts_with("p_"),
      \(x) replace(sprintf("%.3f", x), is.na(x), NA_character_)
    )
  ) |>
  # Fix the column order to match the grouped header added below.
  select(
    match_type,
    n_1930,
    p_1930,
    n_1940,
    p_1940,
    n_combined,
    p_combined
  ) |>
  kable(
    col.names = linebreak(
      c("Link type", "Num.", "Prop.", "Num.", "Prop.", "Num.", "Prop."),
      align = "c"
    ),
    align = rep("c", 7),
    booktabs = TRUE,
    linesep = linesep(7),
    format = "latex"
  ) |>
  kableExtra::add_header_above(c("", "1930" = 2, "1940" = 2, "Combined" = 2)) |>
  save_kable(file.path(tab_dir, "naacp_match_rates_by_type.tex"))

################################################################################
# Match rate by city

# Roster records with their "combined" census link (if any) attached;
# `match` is TRUE when the join found a census_id for the record.
m <- naacp |>
  left_join(naacp_matches |> filter(census_year == "combined"), by = "naacp_id") |>
  mutate(match = !is.na(census_id))

# Linking summary for `df`, computed per group when `df` is grouped:
#   num_rows    - number of rows
#   rate        - share of rows with a link
#   unique      - number of distinct non-missing census_id values
#   unique_rate - unique / num_rows
summarize_link_rates <- function(df) {
  df |>
    summarize(
      num_rows = n(),
      rate = mean(match),
      unique = n_distinct(census_id, na.rm = TRUE),
      unique_rate = unique / num_rows
    )
}

overall <- m |>
  summarize_link_rates() |>
  mutate(city = "Overall")

# One row per branch, largest branches first.
cities <- m |>
  group_by(branch_id) |>
  summarize_link_rates() |>
  arrange(desc(num_rows))

# Branch -> city crosswalk; the "northern-ca" branch is labelled "Northern CA"
# instead of using the city value from the file.
city_xw <- read_csv(
  here("data", "xw", "naacp_cities.csv")
) |>
  mutate(
    city = ifelse(branch_id == "northern-ca", "Northern CA", city)
  )

# Number of branches shown below the "Overall" row.
n_branches_shown <- 30

bind_rows(
  overall,
  cities |>
    left_join(
      # Only the city label is used downstream, so de-duplicate on
      # (branch_id, city): crosswalk rows that differ in other columns would
      # otherwise repeat a branch in the table.
      city_xw |> distinct(branch_id, city),
      by = "branch_id"
    )
) |>
  # "Overall" row plus the largest branches.
  head(n_branches_shown + 1) |>
  select(city, num_rows, rate, unique, unique_rate) |>
  mutate(
    across(c(num_rows, unique), \(x) formatC(x, big.mark = ",")),
    across(c(rate, unique_rate), \(x) formatC(x, format = "f", digits = 3))
  ) |>
  kable(
    col.names = linebreak(
      c("City", "Num.\nrows", "Row\nlinking rate", "Num. unique\nlinks", "Unique\nlinking rate"),
      align = "c"
    ),
    align = c("l", "c", "c", "c", "c"),
    booktabs = TRUE,
    # linesep() is not defined in this file; presumably the vector gives the
    # sizes of row groups to separate (1 for "Overall", then blocks of 5).
    # TODO confirm against its definition.
    linesep = linesep(c(1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)),
    format = "latex",
    # Column headers contain LaTeX produced by linebreak(), so do not escape.
    escape = FALSE
  ) |>
  row_spec(row = 1, bold = TRUE) |>
  save_kable(file.path(tab_dir, "naacp_match_rates.tex"))
